# general libraries
import pandas as pd
import numpy as np
# main viz libraries
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# ML process tools
from sklearn.model_selection import (train_test_split, StratifiedKFold, GridSearchCV, cross_validate,
cross_val_score, cross_val_predict)
from sklearn.pipeline import Pipeline
from sklearn import metrics
# ML Algo
from sklearn.linear_model import SGDClassifier
# utilities
from importlib import reload
from sklearn.datasets import fetch_openml
from IPython.display import display
# viz
from helpers import viz, ml_tools, preprocess
# Setting options
viz.change_theme('white')  # project helper: set the global plotting theme
seed = 42  # single random seed reused for every CV split and model below
def tight_layout(fig, tall=False):
    """Apply the notebook's compact layout to *fig* and display it.

    Parameters
    ----------
    fig : plotly figure to resize and show.
    tall : when True use the taller 650px variant, otherwise 400px.
    """
    if tall:
        height = 650
    else:
        height = 400
    fig.update_layout(
        width=780,
        height=height,
        margin=dict(b=80, l=20, r=20),
    )
    pio.show(fig)
# Download MNIST (70k flattened 28x28 digit images) from OpenML as NumPy arrays
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
features, target = mnist['data'], mnist['target'] # or fetch_openml('mnist_784', version=1, return_X_y=True)
print(mnist['DESCR'])
**Author**: Yann LeCun, Corinna Cortes, Christopher J.C. Burges **Source**: [MNIST Website](http://yann.lecun.com/exdb/mnist/) - Date unknown **Please cite**: The MNIST database of handwritten digits with 784 features, raw data available at: http://yann.lecun.com/exdb/mnist/. It can be split in a training set of the first 60,000 examples, and a test set of 10,000 examples It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image. It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting. The original black and white (bilevel) images from NIST were size normalized to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting images contain grey levels as a result of the anti-aliasing technique used by the normalization algorithm. the images were centered in a 28x28 image by computing the center of mass of the pixels, and translating the image so as to position this point at the center of the 28x28 field. With some classification methods (particularly template-based methods, such as SVM and K-nearest neighbors), the error rate improves when the digits are centered by bounding box rather than center of mass. If you do this kind of pre-processing, you should report it in your publications. The MNIST database was constructed from NIST's NIST originally designated SD-3 as their training set and SD-1 as their test set. However, SD-3 is much cleaner and easier to recognize than SD-1. The reason for this can be found on the fact that SD-3 was collected among Census Bureau employees, while SD-1 was collected among high-school students. Drawing sensible conclusions from learning experiments requires that the result be independent of the choice of training set and test among the complete set of samples. 
Therefore it was necessary to build a new database by mixing NIST's datasets. The MNIST training set is composed of 30,000 patterns from SD-3 and 30,000 patterns from SD-1. Our test set was composed of 5,000 patterns from SD-3 and 5,000 patterns from SD-1. The 60,000 pattern training set contained examples from approximately 250 writers. We made sure that the sets of writers of the training set and test set were disjoint. SD-1 contains 58,527 digit images written by 500 different writers. In contrast to SD-3, where blocks of data from each writer appeared in sequence, the data in SD-1 is scrambled. Writer identities for SD-1 is available and we used this information to unscramble the writers. We then split SD-1 in two: characters written by the first 250 writers went into our new training set. The remaining 250 writers were placed in our test set. Thus we had two sets with nearly 30,000 examples each. The new training set was completed with enough examples from SD-3, starting at pattern # 0, to make a full set of 60,000 training patterns. Similarly, the new test set was completed with SD-3 examples starting at pattern # 35,000 to make a full set with 60,000 test patterns. Only a subset of 10,000 test images (5,000 from SD-1 and 5,000 from SD-3) is available on this site. The full 60,000 sample training set is available. Downloaded from openml.org.
# Inspect the dict-like Bunch returned by fetch_openml
print('DATASET KEYS: ', mnist.keys(), sep='\n', end='\n\n')
DATASET KEYS: dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])
Sample label
# Pick one sample to inspect; at this point labels are still strings (e.g. '4')
sample_index = 20
target[sample_index]
'4'
Features of the sample
# Reshape the flat 784-pixel vector back into its 28x28 image grid
one_sample = features[sample_index].reshape(28, 28)
fig = px.imshow(one_sample,
color_continuous_scale=px.colors.sequential.Greys)
# Hide axes and colorbar: we only want the raw digit picture
fig.update_layout(coloraxis_showscale=False,
width=300, height=200,
xaxis=dict(visible=False),
yaxis=dict(visible=False),
margin=dict(t=10, b=10))
pio.show(fig)
The labels are encoded as strings. Let's deal with this quickly by casting them with the ndarray.astype() method (with as_frame=False the target is a NumPy array, not a pandas Series).
# Cast the string labels to small unsigned ints (ndarray.astype, element-wise)
target = target.astype(np.uint8)
target[sample_index]
4
Since it's already shuffled and split, we can just assign subsets to our train and test variables
# OpenML's mnist_784 is already shuffled and split: first 60k = train, last 10k = test
X_train, X_test, y_train, y_test = features[:60000], features[60000:], target[:60000], target[60000:]
# quick func in case we need to reset our datasets
def reset_dataset():
    """Restore the canonical MNIST split (first 60k train / last 10k test)."""
    global X_train, X_test, y_train, y_test
    X_train = features[:60000]
    X_test = features[60000:]
    y_train = target[:60000]
    y_test = target[60000:]
Let's simplify the problem by considering an algorithm that only discriminates fives. The output will be:
# Binarize the task into a "5-detector": True for digit 5, False otherwise
y_train = (y_train == 5)
y_test = (y_test == 5)
y_train
array([ True, False, False, ..., True, False, False])
Here we make use of the brilliant method cross_validate() that accepts a dictionary {'name':'metric'}
# Linear classifier trained with stochastic gradient descent
sgd_clf = SGDClassifier(random_state=seed)
# Mapping display-name -> sklearn scorer string, accepted by cross_validate
scoring = dict(Accuracy='accuracy', Recall='recall', Precision='precision')
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
scores = cross_validate(estimator=sgd_clf,
X=X_train,
y=y_train,
cv=3,
scoring=scoring,
n_jobs=-1)
scores_strat = cross_validate(estimator=sgd_clf,
X=X_train,
y=y_train,
cv=folds, # just to check whether cv=int defaults to StratifiedKFold or plain KFold
scoring=scoring)
print('ACCURACY WITH KFOLDS', scores['test_Accuracy'].mean())
print('ACCURACY WITH STRAT KFOLDS', scores_strat['test_Accuracy'].mean())
ACCURACY WITH KFOLDS 0.9570333333333334 ACCURACY WITH STRAT KFOLDS 0.9503333333333334
note\
The recall values suggest the two runs used different folds. In fact, with an integer cv scikit-learn does use a StratifiedKFold for classifiers — but without shuffling. So the difference observed here comes from shuffle=True and the random_state in our explicit StratifiedKFold, not from a lack of stratification in the cv=3 run.
What would be the accuracy of a classifier that only outputs False?
from sklearn.base import BaseEstimator
class DumbClassifier(BaseEstimator):
    """Baseline estimator that predicts the negative class for every sample."""

    def fit(self, X, y=None):
        # Nothing to learn; keep sklearn's fit -> self contract.
        return self

    def predict(self, X):
        # One boolean column of False per input row.
        return np.full((len(X), 1), False)
never5_clf = DumbClassifier()
# Accuracy looks great (~91%) only because ~90% of the samples are not 5s
dumb_scores = cross_val_score(never5_clf,
X_train,
y_train,
cv=3,
scoring='accuracy')
print('CLASSIFIER THAT NEVER RETURNS FIVE.\nAccuracy = ', dumb_scores.mean())
CLASSIFIER THAT NEVER RETURNS FIVE. Accuracy = 0.90965
from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix, classification_report
Each row is an actual class\ Each column is a predicted class.
TN | FP\ FN | TP
recall : true positive over all actual positives -> TPR (True Positive Rate)\ precision : true positive over all predicted positives
The precision score compile the number of TP by the number of all predicted positive :\ precision = TP/(TP + FP)
But if the classifier outputs one single Positive which is True and all others preds are Negatives (False and True):
precision = 1/(1+0) = 1, so the precision must be counter-balanced using the recall score:\
recall = TP/(TP + FN)
f1 : the harmonic mean of precision and recall, meaning that it gives more weight to low results. This is a convenient way of comparing classifiers but we need to keep in mind that if f1 favors classifiers with highest precision and recall it favors in the same time classifiers having approximately the same score for the two metrics and this is not what we always want.
We want to diminish the number of false positives, so we want the max of TP over predicted positives: high precision\
-> note that it will probably be at the cost of several good videos not selected (FN), so low recall
We want every ill person (positive) to be detected even if it means a lot of healthy people will be wrongly declared ill (FP). So we want the lower value possible for FN : hence the recall must be the highest
Let's code all of this
We need the preds to compile the various metrics more precisely than with cross_val_score() or cross_validate(). We get them with cross_val_predict() and use the predicted array with the built-in metrics of sklearn. We get the predictions on the train set, of course!
# Out-of-fold predictions: each sample is predicted by a model not trained on it
preds = cross_val_predict(sgd_clf, X_train, y_train, cv=3)
cm = confusion_matrix(y_train, preds)
pr = precision_score(y_train, preds)
rec = recall_score(y_train, preds)
f_one = f1_score(y_train, preds)
print('Confusion Matrix', cm, 'Precision', pr, 'Recall', rec, 'f1', f_one, sep='\n' )
Confusion Matrix [[53892 687] [ 1891 3530]] Precision 0.8370879772350012 Recall 0.6511713705958311 f1 0.7325171197343846
# Project helper: cross-validated accuracy/precision/recall/f1 summary
ml_tools.cross_val_metrics(sgd_clf, X_train, y_train,
big_dataset=True) # under-sample the dataset for the ease of use
CROSS VALIDATION RESULTS OVER 8 SPLITS _______________________________________ [ACCURACY] 0.9598 (+/- 0.0039) [PRECISION] 0.861 (+/- 0.0499) [RECALL] 0.6714 (+/- 0.0828) [F1] 0.7488 (+/- 0.039)
# Per-class precision/recall/f1 plus support, as a formatted string
report = classification_report(y_train, preds)
print(report)
precision recall f1-score support
False 0.97 0.99 0.98 54579
True 0.84 0.65 0.73 5421
accuracy 0.96 60000
macro avg 0.90 0.82 0.85 60000
weighted avg 0.95 0.96 0.95 60000
By trade-off I mean the way the threshold used by the decision function of the model influences the output of the prediction. To see this influence, it's useful to plot the recall and the precision against the various threshold values.
The method sklearn.metrics.precision_recall_curve returns the arrays of precisions, recalls and thresholds (in that order)
from sklearn.metrics import precision_recall_curve
# method='decision_function' mandatory to plot the recall/precision trade-off
y_scores = cross_val_predict(sgd_clf, X_train, y_train, cv=3,
method='decision_function')
# retrieve precision, recall and thresholds (in that order)
precision, recall, thresholds = precision_recall_curve(y_train, y_scores)
General method:
# General pattern: out-of-fold decision scores (closing paren was missing here)
y_scores = cross_val_predict(model, X_train, y_train, cv=folds,
                             method='decision_function')
But for XGBoost and other algo, we need to get y_scores with the method xgb_model.predict_proba(X)[:,1]
# Models without decision_function (XGBoost, forests...): take the P(positive) column
y_scores = xgbmodel.predict_proba(X)[:,1]
# of course it's waaaaay better with a cross validation workflow
y_scores = cross_val_predict(xgbmodel, X_train, y_train, cv=folds,
method='predict_proba')[:,1]
# Project helper: precision & recall as functions of the decision threshold
fig = viz.plot_precision_recall_thresholds(y_train, y_scores,
custom_threshold=-3500, # optional parameter to place a threshold
x_range=[-45_000,38_000]) # optional parameter to shrink the view
tight_layout(fig)
On peut manipuler le threshold utilisé par le modèle. Dans les exemples suivants on voit comment obtenir le meilleur recall pour une precision de 0.9 puis la meilleure precision pour un recall de 0.78.
y_scores est un array qui contient les scores avec la note issue de la method decision-function:
array([ 1200.93051237, -26883.79202424, -33072.03475406, ...,
13272.12718981, -7258.47203373, -16877.50840447])
Avec un treshold a n toutes les valeurs inférieures à n seront False (negatives) et toutes les valeurs supérieures ou égales seront True (positives)
For a given precision
# np.argmax returns the first index of the array satisfying the condition
np.argmax(precision >= 0.90)
# threshold value for precision >= n
thresholds[np.argmax(precision >= 0.90)]
# boolean mask over y_scores according to this threshold
(y_scores >= thresholds[np.argmax(precision >= 0.90)])
# condensed formula
cust_preds = (y_scores >= thresholds[np.argmax(precision >= 0.90)])
For a given recall. The comparison sign flips because the recall curve is decreasing.
# condensed formula (comparison is flipped: recall decreases as the threshold grows)
cust_preds = (y_scores >= thresholds[np.argmax(recall <= 0.78)])
# Predictions at the smallest threshold reaching 90% precision
pred_90_precision = (y_scores >= thresholds[np.argmax(precision >= 0.90)])
print('pr = ', precision_score(y_train, pred_90_precision))
print('rec = ', recall_score(y_train, pred_90_precision))
pr = 0.9000345901072293 rec = 0.4799852425751706
# Predictions at the threshold where recall first drops to <= 0.78.
# Renamed from pred_75_recall: the target level used is 0.78, not 0.75.
pred_78_recall = (y_scores >= thresholds[np.argmax(recall <= 0.78)])
print('pr = ', precision_score(y_train, pred_78_recall))
print('rec = ', recall_score(y_train, pred_78_recall))
pr = 0.73453787352328 rec = 0.7799299022320605
This function returns the array of predictions and print the two metrics by default.
# Project helper: threshold y_scores so the chosen metric reaches the requested level
preds_75 = ml_tools.adjust_threshold(y_train, y_scores,
based_on='recall', # which metric to set
level_required=0.76) # at what level
Precision score: 0.7550870760769935 Recal score: 0.7598229109020476
We just need to get the various precision, recall and thresholds(unused here) like with the precision-recall trade-off plot and plot recall over precision. This curve is a straight forward representation of what it means to increase artificially the recall by moving the threshold.
It's just another way of seeing this relationship between the two metrics.
When to use it over the PR curve ?
# sklearn tools to get the arrays
from sklearn.metrics import precision_recall_curve
# Same arrays as before; thresholds is unused for the PR curve itself
precision, recall, thresholds = precision_recall_curve(y_train, y_scores)
ROC (receiver operating characteristics) is also meant to use with binary only classifiers. It plots the TPR (True Positive Rate aka recall) over the FPR (False Positive Rate) which is FP/len(negative-class).
Note:\
FPR = FP/(FP + TN)\
TNR = TN/(FP + TN) # aka specificity\
hence FPR = 1 - TNR
It's important to notice that FPR = (1 - specificity) and should tend to 0 (specificity -> 1). This is another view of the trade-off: the higher the TPR, the lower the TNR (i.e. the higher the FPR)
# sklearn tools to get the arrays
from sklearn.metrics import roc_curve
fpr, tpr, _ = roc_curve(y_train, y_scores) # _ = thresholds. They're not used to plot the classic roc_curve
# Project helper: PR curve and ROC curve side by side
fig = viz.plot_precision_recall_and_roc(y_train, y_scores)
tight_layout(fig)
# Project helper: full dashboard (baseline metrics + curves) for a binary classifier
fig = viz.plot_binomial_clf_performance(sgd_clf, y_train, preds, y_scores)
tight_layout(fig, tall=True)
It's very straightforward: just compare scores. The tricky part is the choice of the scoring metric. Let's import and train a Random Forest Classifier.
from sklearn.ensemble import RandomForestClassifier
A visual assessment of the curve will be double-checked by measuring the area under the curve. Import this metric as well
from sklearn.metrics import roc_auc_score
Hiyaa, we can finally grab our results, plot everything, close the laptop and play football with kids.. ah but it's not as simple as we saw it in the last chapter. Like XGBoost and other ensemble algo Random Forest is not implementing a decision_function() method but a predict_proba() instead.
This predict_proba returns an array of dimension (m,n) such as
m : number of instances\
n : number of classes\
This is where it becomes interesting, since the n columns contain the probability that a given instance belongs to each class. So if we have an array like this:
# Project helper: load a saved example of a predict_proba output (table shown below)
display(preprocess.load_model('predict_proba_exemple'))
| pred_neg | pred_pos | |
|---|---|---|
| instance_0 | 0.817191 | 0.182809 |
| instance_1 | 0.439699 | 0.560301 |
| instance_2 | 0.250046 | 0.749954 |
| instance_3 | 0.990602 | 0.009398 |
| instance_4 | 0.776365 | 0.223635 |
and now we can simply take the column of positive prediction which is in fact the score we're aiming for :
# cross_val_predict takes the estimator as its first argument (it was missing here)
all_probas = cross_val_predict(model, X, y, method='predict_proba')
y_scores = all_probas[:,1]
# or, more simply but less elegant — the model must already be fitted:
y_scores = rndfor_model.predict_proba(X)[:,1]
rndfor_clf = RandomForestClassifier(n_estimators=100)
# Out-of-fold P(positive) scores for the random forest
rndfor_clf_scores = cross_val_predict(rndfor_clf, X_train, y_train,
n_jobs=-1, cv=folds, method='predict_proba')[:,1]
# unused since i use a custom func but under the hood this is what is used to plot:
# roc_curve returns (fpr, tpr, thresholds); the former names pr/rec were misleading
fpr_rndfor, tpr_rndfor, thr_rndfor = roc_curve(y_train, rndfor_clf_scores)
fig = viz.plot_precision_recall_and_roc(y_train, rndfor_clf_scores)
tight_layout(fig)
Since we're in the process of shortlisting a classifier, we're using a function that only compares:
It is more convenient to limit the comparison to only two models, because a plot with more than two areas represented is not very readable. Anyway, let's plot! But before that we can structure our work a little. All the arrays of probas or predictions, all the metric variables, etc. — that's a lot to remember. We can make use of a dataclass object that stores all of this efficiently; we can then access what we want through the attributes of each model.
Here is the whole procedure:
# we can recreate the stratfold object (same n_splits/seed as earlier so folds are comparable)
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
Since we need only the decision_function array (or predict_proba, depending on the model), we get both arrays with cross_val_predict and the right method argument passed.
# Hard out-of-fold class predictions (default method='predict')
sgd_preds = cross_val_predict(sgd_clf, X_train, y_train, cv=folds, n_jobs=-1)
# Continuous decision-function scores, needed for the PR/ROC curves
sgd_probas = cross_val_predict(sgd_clf, X_train, y_train, cv=folds, n_jobs=-1,
method='decision_function')
then we create the object storing all we need.
# Project helper: compute the metrics and bundle everything into a ClassifierModel dataclass
baseline_model = ml_tools.create_binary_clf_dataclass('Baseline', sgd_clf, y_train, sgd_preds, sgd_probas)
Here is a quick function that generates our various metrics and populates a dataclass object. Nothing fancy here; the template is the following:
# Template of the dataclass built by ml_tools.create_binary_clf_dataclass.
# NOTE(review): the defaults (default=np.array, default=float, default=ClassifierMixin)
# store the type/function object itself as a placeholder — confirm this is intentional
# in ml_tools rather than a real default value.
@dataclass(frozen=True, order=True, eq=True, unsafe_hash=True)
class ClassifierModel:
    name: str = field(default='Base_name')
    model: ClassifierMixin = field(default=ClassifierMixin, compare=False)
    y: np.ndarray = field(default=np.array, compare=False, repr=False)
    scores: np.ndarray = field(default=np.array, compare=False, repr=False)
    fp_rates: np.ndarray = field(default=np.array, compare=False, repr=False)
    tp_rates: np.ndarray = field(default=np.array, compare=False, repr=False)
    thresholds: np.ndarray = field(default=np.array, compare=False, repr=False)
    # auc is the only field taking part in ordering/equality comparisons
    auc: float = field(default=float, compare=True)
    accuracy: float = field(default=float, compare=False)
    precision: float = field(default=float, compare=False)
    recall: float = field(default=float, compare=False)
    f1: float = field(default=float, compare=False)
    desc: str = field(default=f"Custom Object to store a model, and basic results", repr=True)
The resulting object is looking like this :
__repr__ can be customized; I chose to display only the metrics, the model parameters and the desc, but it can be changed if you need. Useful accessors: object.parameter; dataclasses.asdict(object) to get the same object as a dict; the dataclasses.replace(obj, attribute=value) method.
print(f'{baseline_model = }')
baseline_model = ClassifierModel(name='Baseline', model=SGDClassifier(random_state=42), auc=0.952, accuracy=0.95, precision=0.698, recall=0.795, f1=0.743)
We can now create the object of the promising model:
# Use cv=folds for both calls so preds and probas come from the same splits
rnd_preds = cross_val_predict(rndfor_clf, X_train, y_train, cv=folds, n_jobs=-1)
rnd_probas = cross_val_predict(rndfor_clf, X_train, y_train, cv=folds, n_jobs=-1,
method='predict_proba')[:,1]
# Fixed copy-paste bug: this model was labelled 'Baseline' like the SGD one
promising_model = ml_tools.create_binary_clf_dataclass('Promising', rndfor_clf, y_train, rnd_preds, rnd_probas)
print(f'{promising_model = }')
promising_model = ClassifierModel(name='Baseline', model=RandomForestClassifier(), auc=0.999, accuracy=0.988, precision=0.991, recall=0.871, f1=0.927)
We can finally compare our models:
# Project helper: visual comparison of the two stored models
fig = viz.compare_two_classifiers(baseline_model, promising_model)
tight_layout(fig, tall=True)
Of course this is not a practical method to always compare two binary classifiers, but it can be handy to be able to display a clear and visual summary of the difference between two models. In the end, all this process can be done with these lines of code:
# pipelining
from sklearn.pipeline import Pipeline
# data processing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# model evaluation
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_predict
# classifier
from sklearn.neighbors import KNeighborsClassifier
# model structure and classifier implement
# NOTE(review): PCA is applied before StandardScaler here — the usual order is to
# scale first, then reduce; confirm this ordering is deliberate.
base_clf = Pipeline([
('pca', PCA(n_components=0.5, iterated_power=7)),
('scaler', StandardScaler()),
('clf', KNeighborsClassifier(n_jobs=-1))
])
# useful variables
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
# fixed typo: the key 'precison' would have produced a 'test_precison' result name
scoring = dict(recall='recall', precision='precision', f1='f1', accuracy='accuracy')
# auc: out-of-fold P(positive) scores, then area under the ROC curve
probas = cross_val_predict(base_clf, X_train, y_train, cv=folds, n_jobs=-1, method='predict_proba' )[:,1]
auc = roc_auc_score(y_train, probas)
# baseline metrics
base_clf_scores = cross_validate(base_clf, X_train, y_train, scoring=scoring, cv=folds, n_jobs=-1)
# consolidate model
base_clf_scores.update(dict(model=base_clf, auc=auc))